/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	pdb_full_structures.dta
			pdb_full_papers.dta
		
Outputs: 	pdb_analysis.dta
			sample_restrictions.log

Purpose: 	Define the analysis sample, pdb_analysis. 
			This sample is unique at the structure level.

*******************************************************************************/
	
	
* Start from an empty session, then open the structure-level file
clear all
use "${data_built}pdb_full_structures.dta"

* Bring in the paper-level variables stk_cites_nslf3 and lastAuthorId.
* Several structures may point at the same paper, hence m:1 on pubmedId.
* assert(master match) stops the do-file if the papers file contains a
* pubmedId that no structure refers to; nogenerate skips creating _merge.
merge m:1 pubmedId using "${data_built}pdb_full_papers.dta", 			///
	keepusing(stk_cites_nslf3 lastAuthorId)								///
	assert(master match) nogenerate
	
	
* Log sample restrictions
* A named log is used so that a log already opened by a calling do-file
* (or interactively) is neither closed nor replaced by this script.
cap log close sample_restrictions
log using "${data_built}sample_restrictions.log", replace				///
	name(sample_restrictions)
	
* Sample restrictions - all at the structure level.
* The count after each step writes the remaining number of observations
* to the log.

	* Only look at x-ray structures
	keep if experimentalTechnique == "X-RAY DIFFRACTION" 
	count 
	
	* Drop if there are 15+ entities 
	* (Stata treats missing as larger than any number, so observations with
	* a missing numEntities are dropped here as well.)
	drop if numEntities >= 15
	count
	
	* Must be only structure in paper	
	* missing() is used instead of a comparison with . so that extended
	* missing values (.a-.z) are also recognized as "no paper".
	drop if structuresInPaper > 1 & !missing(pubmedId)
	count
	
	* If no paper, must be only structure in project
	drop if structuresInProject > 1 & missing(pubmedId)
	count
		
	* Must be the priority structure
	keep if priorityStructure == 1
	count
		
	* Drops observations w/ missing quality outcomes (including extended
	* missing values, which would otherwise propagate into the index)
	foreach outcome in refinementResolution rFree ramaRaw {
		drop if missing(`outcome')
	}
	count
	
	* If deposit date is prior to 1999, code maturation as missing due to
	* changing policies. No observations are dropped by this step, so the
	* count below repeats the previous one.
	replace maturation = . if depositionYear < 1999
	count
	
	* Drop membrane proteins
	drop if membrane_protein == 1
	count

log close sample_restrictions
	
* Now that sample is defined, standardize some outcomes.
* Each raw outcome is negated and then standardized with egen std()
* (mean 0, standard deviation 1 within the final sample).
* NOTE(review): the sign flip presumably makes larger values mean higher
* quality - confirm against the variable definitions.

	egen resolutionStd = std(-refinementResolution)
	lab var resolutionStd "Standardized resolution"
	
	egen rFreeStd = std(-rFree)
	lab var rFreeStd "Standardized R-free"
	
	egen ramaRawStd = std(-ramaRaw)
	lab var ramaRawStd "Standardized Ramachandran outliers"
	
	* Quality index: sum of the three standardized outcomes, re-standardized
	gen index = resolutionStd + rFreeStd + ramaRawStd
	egen indexStd = std(index)
	lab var indexStd "Standardized quality index"
		
* Keep the identifiers, the standardized outcomes, and the variables this
* do-file merged in (stk_cites_nslf3, lastAuthorId) or recoded (maturation);
* leaving them out would discard that work. The unstandardized sum index is
* not kept.
keep structureId pubmedId stk_cites_nslf3 lastAuthorId maturation		///
	resolutionStd rFreeStd ramaRawStd indexStd
save "${data_built}pdb_analysis.dta", replace

	

